# This script extracts the data from SSA ASR Table 60 HTML pages and saves it as CSV files

library(rvest)
library(tidyverse)
library(tools)

# Table 60 pages: every path under html/ (subdirectories included) that
# contains "/SSDI_ASR_60_"
html_files <- str_subset(
  list.files("html", recursive = TRUE),
  ".*(\\/SSDI_ASR_60_).*"
)

# Convert one SSA ASR Table 60 HTML page into a CSV file.
#
# Args:
#   html_file: Path of the HTML file relative to the "html" directory. The age
#     group (or single year of age) is taken from the part of the path between
#     "/SSDI_ASR_60_" and ".html".
#
# Side effects:
#   Writes "csv/<html_file with its extension replaced by .csv>", creating the
#   target directory when needed, and prints any group names that contain
#   non-ASCII characters before they are cleaned.
#
# Returns:
#   Whatever write_csv() returns (readr documents this as the written data,
#   invisibly).
table60_html_to_csv <- function(html_file) {
  # For interactive debugging: html_file <- html_files[1]

  # Table label stored in the `table` column of the output.
  # NOTE(review): the original code referenced an undefined `table` object
  # (which resolves to base::table and makes mutate() fail); "60" is assumed
  # from the file naming scheme -- confirm the intended label.
  table_id <- "60"

  # Age group or single year of age for each file
  age_label <- str_extract(html_file, "(?<=\\/SSDI_ASR_60_).*(?=[.]html)")

  # Parse the first HTML table of the page, keeping header rows as data
  x <- read_html(file.path("html", html_file)) %>%
    html_table(header = FALSE, fill = TRUE) %>%
    .[[1]]

  # Panel rows have an empty first cell and carry the group name in the
  # second column. `[[` is used so this works whether html_table() returned
  # a base data.frame or a tibble.
  panels <- which(x[[1]] == "")
  stopifnot(length(panels) > 0)

  # Repeat each group name once per data row of its panel. The last row of
  # the table is not data (it is dropped below), hence nrow(x) as the end
  # marker and the "- 1".
  group_names <- rep(
    as.character(x[[2]][panels]),
    diff(c(panels, nrow(x))) - 1
  )

  # Remove non-breaking spaces from group names
  showNonASCII(group_names)
  group_names <- str_replace_all(group_names, "[^\\x00-\\x7F]+", " ")
  stopifnot(length(showNonASCII(group_names)) == 0)

  # Rename columns
  column_names <- c("Year",
                    "Total",
                    "Pending final decision",
                    "Technical denials",
                    "Medical decisions, Denials, Medical",
                    "Medical decisions, Denials, Subsequent nonmedical",
                    "Medical decisions, Allowances, Awards",
                    "Medical decisions, Allowances, Subsequent denials",
                    "Award rate (percent)",
                    "Allowance rate (percent)")
  # Fail early with a clear message if the page layout is not the expected one
  stopifnot(ncol(x) == length(column_names))
  colnames(x) <- column_names

  # Rows that are not data: everything up to and including the first panel
  # row, the remaining panel rows, and the last row of the table
  non_data_rows <- c(seq_len(panels[1]), panels[-1], nrow(x))

  # Select rows corresponding to data; add table, group, and age info
  y <- x %>%
    as_tibble() %>%
    slice(-non_data_rows) %>%
    mutate(table = table_id, group = group_names, age = age_label) %>%
    select(table, group, age, everything())

  # Save as CSV, mirroring the directory layout found under html/
  csv_file <- file.path("csv", paste0(file_path_sans_ext(html_file), ".csv"))
  dir.create(dirname(csv_file), showWarnings = FALSE, recursive = TRUE)
  write_csv(y, csv_file)
}

# Convert all HTML files to CSV. The conversion is run purely for its side
# effect (writing files), so walk() is used instead of lapply() to avoid
# building and auto-printing a list of every written table.
walk(html_files, table60_html_to_csv)




# EOF
